import pandas as pd
df = pd.read_csv(r'C:\Machine Learning\Datasets\weatherAUS.csv')
print(df)
Date Location MinTemp MaxTemp Rainfall Evaporation \
0 2008-12-01 Albury 13.4 22.9 0.6 NaN
1 2008-12-02 Albury 7.4 25.1 0.0 NaN
2 2008-12-03 Albury 12.9 25.7 0.0 NaN
3 2008-12-04 Albury 9.2 28.0 0.0 NaN
4 2008-12-05 Albury 17.5 32.3 1.0 NaN
... ... ... ... ... ... ...
145455 2017-06-21 Uluru 2.8 23.4 0.0 NaN
145456 2017-06-22 Uluru 3.6 25.3 0.0 NaN
145457 2017-06-23 Uluru 5.4 26.9 0.0 NaN
145458 2017-06-24 Uluru 7.8 27.0 0.0 NaN
145459 2017-06-25 Uluru 14.9 NaN 0.0 NaN
Sunshine WindGustDir WindGustSpeed WindDir9am ... Humidity9am \
0 NaN W 44.0 W ... 71.0
1 NaN WNW 44.0 NNW ... 44.0
2 NaN WSW 46.0 W ... 38.0
3 NaN NE 24.0 SE ... 45.0
4 NaN W 41.0 ENE ... 82.0
... ... ... ... ... ... ...
145455 NaN E 31.0 SE ... 51.0
145456 NaN NNW 22.0 SE ... 56.0
145457 NaN N 37.0 SE ... 53.0
145458 NaN SE 28.0 SSE ... 51.0
145459 NaN NaN NaN ESE ... 62.0
Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am \
0 22.0 1007.7 1007.1 8.0 NaN 16.9
1 25.0 1010.6 1007.8 NaN NaN 17.2
2 30.0 1007.6 1008.7 NaN 2.0 21.0
3 16.0 1017.6 1012.8 NaN NaN 18.1
4 33.0 1010.8 1006.0 7.0 8.0 17.8
... ... ... ... ... ... ...
145455 24.0 1024.6 1020.3 NaN NaN 10.1
145456 21.0 1023.5 1019.1 NaN NaN 10.9
145457 24.0 1021.0 1016.8 NaN NaN 12.5
145458 24.0 1019.4 1016.5 3.0 2.0 15.1
145459 36.0 1020.2 1017.9 8.0 8.0 15.0
Temp3pm RainToday RainTomorrow
0 21.8 No No
1 24.3 No No
2 23.2 No No
3 26.5 No No
4 29.7 No No
... ... ... ...
145455 22.4 No No
145456 24.5 No No
145457 26.1 No No
145458 26.0 No No
145459 20.9 No NaN
[145460 rows x 23 columns]
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null object 1 Location 145460 non-null object 2 MinTemp 143975 non-null float64 3 MaxTemp 144199 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 WindDir9am 134894 non-null object 10 WindDir3pm 141232 non-null object 11 WindSpeed9am 143693 non-null float64 12 WindSpeed3pm 142398 non-null float64 13 Humidity9am 142806 non-null float64 14 Humidity3pm 140953 non-null float64 15 Pressure9am 130395 non-null float64 16 Pressure3pm 130432 non-null float64 17 Cloud9am 89572 non-null float64 18 Cloud3pm 86102 non-null float64 19 Temp9am 143693 non-null float64 20 Temp3pm 141851 non-null float64 21 RainToday 142199 non-null object 22 RainTomorrow 142193 non-null object dtypes: float64(16), object(7) memory usage: 25.5+ MB
df.dropna(subset = ['RainToday','RainTomorrow'], inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 140787 entries, 0 to 145458 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 140787 non-null object 1 Location 140787 non-null object 2 MinTemp 140319 non-null float64 3 MaxTemp 140480 non-null float64 4 Rainfall 140787 non-null float64 5 Evaporation 81093 non-null float64 6 Sunshine 73982 non-null float64 7 WindGustDir 131624 non-null object 8 WindGustSpeed 131682 non-null float64 9 WindDir9am 131127 non-null object 10 WindDir3pm 137117 non-null object 11 WindSpeed9am 139732 non-null float64 12 WindSpeed3pm 138256 non-null float64 13 Humidity9am 139270 non-null float64 14 Humidity3pm 137286 non-null float64 15 Pressure9am 127044 non-null float64 16 Pressure3pm 127018 non-null float64 17 Cloud9am 88162 non-null float64 18 Cloud3pm 84693 non-null float64 19 Temp9am 140131 non-null float64 20 Temp3pm 138163 non-null float64 21 RainToday 140787 non-null object 22 RainTomorrow 140787 non-null object dtypes: float64(16), object(7) memory usage: 25.8+ MB
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] =14
matplotlib.rcParams['figure.figsize'] =(10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
df.Location.nunique()
49
import plotly.express as px
px.histogram(df,x='Location',title='Location vs. Rainy Days', color = 'RainToday')
px.histogram(df,x='Temp3pm',title='Tempat3PM vs. Raintomorrow',color='RainTomorrow')
px.histogram(df,x='RainTomorrow',title='RainToday vs. RainTomorrow',color='RainToday')
px.scatter(df.sample(2000),title = 'Min Temp vs Max Temp' , x = 'MinTemp',y='MaxTemp',color='RainToday')
from sklearn.model_selection import train_test_split
train_val_df, test_df = train_test_split(df,test_size=0.2, random_state=42)
train_df,val_df = train_test_split(train_val_df,test_size=0.25,random_state=42)
print('train_df.shape:' , train_df.shape)
print('val_df.shape:' , train_df.shape)
print('test_df.shape:' , train_df.shape)
train_df.shape: (84471, 23) val_df.shape: (84471, 23) test_df.shape: (84471, 23)
plt.title('No of rows per year')
sns.countplot(x=pd.to_datetime(df.Date).dt.year);
year = pd.to_datetime(df.Date).dt.year
train_df = df[year<2015]
val_df = df[year==2015]
test_df = df[year>2015]
input_col = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'
print(input_col)
['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
target_col
'RainTomorrow'
train_inputs = train_df[input_col].copy()
train_targets = train_df[target_col].copy()
val_inputs = val_df[input_col].copy()
val_targets = val_df[target_col].copy()
test_inputs = test_df[input_col].copy()
test_targets = test_df[target_col].copy()
train_inputs
| Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | WindDir3pm | ... | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | WNW | ... | 24.0 | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No |
| 1 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | WSW | ... | 22.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No |
| 2 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | WSW | ... | 26.0 | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No |
| 3 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | E | ... | 9.0 | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No |
| 4 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | NW | ... | 20.0 | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 144548 | Uluru | 16.9 | 33.2 | 0.0 | NaN | NaN | SSE | 43.0 | ESE | SSE | ... | 26.0 | 22.0 | 13.0 | 1014.1 | 1009.8 | NaN | NaN | 23.7 | 31.8 | No |
| 144549 | Uluru | 15.1 | 36.8 | 0.0 | NaN | NaN | NE | 31.0 | ENE | SW | ... | 20.0 | 16.0 | 8.0 | 1012.6 | 1007.6 | NaN | NaN | 28.9 | 34.8 | No |
| 144550 | Uluru | 17.3 | 37.8 | 0.0 | NaN | NaN | ESE | 39.0 | ESE | SSE | ... | 9.0 | 15.0 | 8.0 | 1011.9 | 1008.0 | NaN | NaN | 29.7 | 35.7 | No |
| 144551 | Uluru | 20.1 | 38.5 | 0.0 | NaN | NaN | ESE | 43.0 | ESE | SSW | ... | 17.0 | 22.0 | 9.0 | 1014.0 | 1009.2 | NaN | NaN | 29.8 | 37.2 | No |
| 144552 | Uluru | 22.5 | 39.6 | 0.0 | NaN | NaN | WNW | 76.0 | ENE | SSW | ... | 13.0 | 16.0 | 9.0 | 1012.1 | 1006.2 | NaN | NaN | 30.1 | 37.4 | No |
97988 rows × 21 columns
import numpy as np
numeric_col = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_col = train_inputs.select_dtypes('object').columns.tolist()
numeric_col
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
categorical_col
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
train_inputs[numeric_col].describe()
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 97674.000000 | 97801.000000 | 97988.000000 | 61657.000000 | 57942.000000 | 91160.000000 | 97114.000000 | 96919.000000 | 96936.000000 | 96872.000000 | 88876.000000 | 88857.000000 | 63000.000000 | 61966.000000 | 97414.000000 | 97392.000000 |
| mean | 12.007831 | 23.022202 | 2.372935 | 5.289991 | 7.609004 | 40.215873 | 14.092263 | 18.764608 | 68.628745 | 51.469547 | 1017.513734 | 1015.132352 | 4.302952 | 4.410677 | 16.835126 | 21.540138 |
| std | 6.347175 | 6.984397 | 8.518819 | 3.952010 | 3.788813 | 13.697967 | 8.984203 | 8.872398 | 19.003097 | 20.756113 | 7.072510 | 6.997072 | 2.866634 | 2.693370 | 6.404586 | 6.831612 |
| min | -8.500000 | -4.100000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 980.500000 | 979.000000 | 0.000000 | 0.000000 | -5.900000 | -5.100000 |
| 25% | 7.500000 | 17.900000 | 0.000000 | 2.600000 | 4.800000 | 31.000000 | 7.000000 | 13.000000 | 57.000000 | 37.000000 | 1012.800000 | 1010.400000 | 1.000000 | 2.000000 | 12.200000 | 16.600000 |
| 50% | 11.800000 | 22.400000 | 0.000000 | 4.600000 | 8.500000 | 39.000000 | 13.000000 | 19.000000 | 70.000000 | 52.000000 | 1017.500000 | 1015.100000 | 5.000000 | 5.000000 | 16.600000 | 20.900000 |
| 75% | 16.600000 | 27.900000 | 0.800000 | 7.200000 | 10.600000 | 48.000000 | 19.000000 | 24.000000 | 83.000000 | 66.000000 | 1022.300000 | 1019.900000 | 7.000000 | 7.000000 | 21.400000 | 26.200000 |
| max | 33.900000 | 48.100000 | 371.000000 | 82.400000 | 14.300000 | 135.000000 | 87.000000 | 87.000000 | 100.000000 | 100.000000 | 1041.000000 | 1039.600000 | 9.000000 | 9.000000 | 40.200000 | 46.100000 |
train_inputs[categorical_col].describe()
| Location | WindGustDir | WindDir9am | WindDir3pm | RainToday | |
|---|---|---|---|---|---|
| count | 97988 | 91120 | 90969 | 96036 | 97988 |
| unique | 49 | 16 | 16 | 16 | 2 |
| top | Canberra | W | N | SE | No |
| freq | 2506 | 6672 | 8012 | 7603 | 76002 |
train_inputs[categorical_col].nunique()
Location 49 WindGustDir 16 WindDir9am 16 WindDir3pm 16 RainToday 2 dtype: int64
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean')
df[numeric_col].isna().sum()
MinTemp 468 MaxTemp 307 Rainfall 0 Evaporation 59694 Sunshine 66805 WindGustSpeed 9105 WindSpeed9am 1055 WindSpeed3pm 2531 Humidity9am 1517 Humidity3pm 3501 Pressure9am 13743 Pressure3pm 13769 Cloud9am 52625 Cloud3pm 56094 Temp9am 656 Temp3pm 2624 dtype: int64
imputer.fit(df[numeric_col])
SimpleImputer()
list(imputer.statistics_)
[12.18482386562048, 23.235120301822324, 2.349974074310839, 5.472515506887154, 7.630539861047281, 39.97051988882308, 13.990496092519967, 18.631140782316862, 68.82683277087672, 51.44928834695453, 1017.6545771543717, 1015.2579625879797, 4.431160817585808, 4.499250233195188, 16.98706638787991, 21.69318269001107]
train_inputs[numeric_col] = imputer.transform(train_inputs[numeric_col])
val_inputs[numeric_col] = imputer.transform(val_inputs[numeric_col])
test_inputs[numeric_col] = imputer.transform(test_inputs[numeric_col])
train_inputs[numeric_col].isna().sum()
MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 0 Sunshine 0 WindGustSpeed 0 WindSpeed9am 0 WindSpeed3pm 0 Humidity9am 0 Humidity3pm 0 Pressure9am 0 Pressure3pm 0 Cloud9am 0 Cloud3pm 0 Temp9am 0 Temp3pm 0 dtype: int64
from sklearn.preprocessing import oneHotEncoder
--------------------------------------------------------------------------- ImportError Traceback (most recent call last) Input In [78], in <cell line: 1>() ----> 1 from sklearn.preprocessing import oneHotEncoder ImportError: cannot import name 'oneHotEncoder' from 'sklearn.preprocessing' (C:\Users\Vikas Yadav\anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py)